In [ ]:
# Dr. M. Baron, Statistical Machine Learning class, STAT-427/627
# CLUSTERING
# Import necessary libraries
! pip install pandas numpy scikit-learn matplotlib scipy
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from scipy.cluster.hierarchy import linkage, dendrogram, cut_tree
from sklearn.preprocessing import StandardScaler
In [5]:
# 0. Review of principal components – another unsupervised learning method
# Load the dataset
url = "https://vincentarelbundock.github.io/Rdatasets/csv/datasets/USArrests.csv"
USArrests = pd.read_csv(url, index_col=0)
# Standardize the data so each feature has mean 0 and variance 1
X_scaled = StandardScaler().fit_transform(USArrests)
features = USArrests.columns # Get column names for the features
# State names are the row labels (the CSV's first column became the index)
state_names = USArrests.index
# Perform PCA, keeping the first two principal components
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)
# Plot the PCA components
plt.figure(figsize=(12, 9))
plt.scatter(X_pca[:, 0], X_pca[:, 1], alpha=0.5)
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.title("Biplot with State Labels and Enhanced Arrows")
# Add labels for each state
for i, state in enumerate(state_names):
    plt.text(X_pca[i, 0], X_pca[i, 1], state, ha='right', color='blue', fontsize=8)
# Plot arrows (loadings) for each feature
for i, feature in enumerate(features):
    plt.arrow(0, 0,
              pca.components_[0, i] * 2,  # adjust the factor 2 to lengthen or shorten arrows
              pca.components_[1, i] * 2,
              color='red',
              width=0.02,      # thickness of the arrow shaft
              head_width=0.1)  # width of the arrowhead
    plt.text(pca.components_[0, i] * 2.2, pca.components_[1, i] * 2.2,
             feature, color='red', ha='center', va='center')
plt.grid()
plt.show()
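In [ ]:
# A quick check of how much variance the two plotted components capture.
# Minimal sketch using the pca object fitted above; for the standardized
# USArrests data, PC1 and PC2 together explain roughly 87% of the variance.
print("Explained variance ratio:", pca.explained_variance_ratio_)
print(f"Total shown in the biplot: {pca.explained_variance_ratio_.sum():.1%}")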
In [7]:
# 1. K-means method
# K-means clustering with K=2
kmeans_2 = KMeans(n_clusters=2, n_init=20, random_state=42)  # 20 random starts for stability
clusters_2 = kmeans_2.fit_predict(X_scaled)
# Cluster means
print('Cluster means:\n', kmeans_2.cluster_centers_)
# Clustering vector
USArrests['Cluster'] = clusters_2
print(USArrests[['Murder', 'Assault', 'UrbanPop', 'Rape', 'Cluster']])
# Plot K-means clusters in the space of the first two principal components
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=clusters_2, s=100, alpha=0.5)
plt.title('K-means Clusters (K=2)')
plt.show()
# K-means with K=5
kmeans_5 = KMeans(n_clusters=5, n_init=20, random_state=42)
clusters_5 = kmeans_5.fit_predict(X_scaled)
# Plot K-means clusters in PC space for K=5
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=clusters_5, s=100, alpha=0.5)
plt.title('K-means Clusters (K=5)')
plt.show()
Cluster means:
 [[-0.67675778 -0.68274685 -0.13306084 -0.57037591]
 [ 1.01513667  1.02412028  0.19959126  0.85556386]]
                Murder  Assault  UrbanPop  Rape  Cluster
rownames
Alabama           13.2      236        58  21.2        1
Alaska            10.0      263        48  44.5        1
Arizona            8.1      294        80  31.0        1
Arkansas           8.8      190        50  19.5        0
California         9.0      276        91  40.6        1
Colorado           7.9      204        78  38.7        1
Connecticut        3.3      110        77  11.1        0
Delaware           5.9      238        72  15.8        0
Florida           15.4      335        80  31.9        1
Georgia           17.4      211        60  25.8        1
Hawaii             5.3       46        83  20.2        0
Idaho              2.6      120        54  14.2        0
Illinois          10.4      249        83  24.0        1
Indiana            7.2      113        65  21.0        0
Iowa               2.2       56        57  11.3        0
Kansas             6.0      115        66  18.0        0
Kentucky           9.7      109        52  16.3        0
Louisiana         15.4      249        66  22.2        1
Maine              2.1       83        51   7.8        0
Maryland          11.3      300        67  27.8        1
Massachusetts      4.4      149        85  16.3        0
Michigan          12.1      255        74  35.1        1
Minnesota          2.7       72        66  14.9        0
Mississippi       16.1      259        44  17.1        1
Missouri           9.0      178        70  28.2        1
Montana            6.0      109        53  16.4        0
Nebraska           4.3      102        62  16.5        0
Nevada            12.2      252        81  46.0        1
New Hampshire      2.1       57        56   9.5        0
New Jersey         7.4      159        89  18.8        0
New Mexico        11.4      285        70  32.1        1
New York          11.1      254        86  26.1        1
North Carolina    13.0      337        45  16.1        1
North Dakota       0.8       45        44   7.3        0
Ohio               7.3      120        75  21.4        0
Oklahoma           6.6      151        68  20.0        0
Oregon             4.9      159        67  29.3        0
Pennsylvania       6.3      106        72  14.9        0
Rhode Island       3.4      174        87   8.3        0
South Carolina    14.4      279        48  22.5        1
South Dakota       3.8       86        45  12.8        0
Tennessee         13.2      188        59  26.9        1
Texas             12.7      201        80  25.5        1
Utah               3.2      120        80  22.9        0
Vermont            2.2       48        32  11.2        0
Virginia           8.5      156        63  20.7        0
Washington         4.0      145        73  26.2        0
West Virginia      5.7       81        39   9.3        0
Wisconsin          2.6       53        66  10.8        0
Wyoming            6.8      161        60  15.6        0
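In [ ]:
# Choosing K with an "elbow" plot: total within-cluster sum of squares
# (sklearn's inertia_) versus K. A minimal sketch reusing X_scaled from
# above; look for the bend where adding clusters stops paying off.
wss = []
K_range = range(1, 11)
for k in K_range:
    km = KMeans(n_clusters=k, n_init=20, random_state=42).fit(X_scaled)
    wss.append(km.inertia_)  # within-cluster sum of squares
plt.plot(K_range, wss, marker='o')
plt.xlabel('Number of clusters K')
plt.ylabel('Total within-cluster SS (inertia)')
plt.title('Elbow Plot')
plt.show()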
In [9]:
# 2. Hierarchical Clustering and Dendrogram
HC = linkage(X_scaled, method='complete')
plt.figure(figsize=(10, 8))
dendrogram(HC, labels=USArrests.index)
plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('States')
plt.ylabel('Distance')
plt.show()
# Cutting the dendrogram to create clusters
clusters_hc = cut_tree(HC, n_clusters=5)
USArrests['HC_Cluster'] = clusters_hc.flatten()
print(USArrests[['Murder', 'Assault', 'UrbanPop', 'Rape', 'HC_Cluster']])
                Murder  Assault  UrbanPop  Rape  HC_Cluster
rownames
Alabama           13.2      236        58  21.2           0
Alaska            10.0      263        48  44.5           1
Arizona            8.1      294        80  31.0           2
Arkansas           8.8      190        50  19.5           3
California         9.0      276        91  40.6           2
Colorado           7.9      204        78  38.7           2
Connecticut        3.3      110        77  11.1           3
Delaware           5.9      238        72  15.8           3
Florida           15.4      335        80  31.9           2
Georgia           17.4      211        60  25.8           0
Hawaii             5.3       46        83  20.2           3
Idaho              2.6      120        54  14.2           4
Illinois          10.4      249        83  24.0           2
Indiana            7.2      113        65  21.0           3
Iowa               2.2       56        57  11.3           4
Kansas             6.0      115        66  18.0           3
Kentucky           9.7      109        52  16.3           3
Louisiana         15.4      249        66  22.2           0
Maine              2.1       83        51   7.8           4
Maryland          11.3      300        67  27.8           2
Massachusetts      4.4      149        85  16.3           3
Michigan          12.1      255        74  35.1           2
Minnesota          2.7       72        66  14.9           3
Mississippi       16.1      259        44  17.1           0
Missouri           9.0      178        70  28.2           3
Montana            6.0      109        53  16.4           4
Nebraska           4.3      102        62  16.5           4
Nevada            12.2      252        81  46.0           2
New Hampshire      2.1       57        56   9.5           4
New Jersey         7.4      159        89  18.8           3
New Mexico        11.4      285        70  32.1           2
New York          11.1      254        86  26.1           2
North Carolina    13.0      337        45  16.1           0
North Dakota       0.8       45        44   7.3           4
Ohio               7.3      120        75  21.4           3
Oklahoma           6.6      151        68  20.0           3
Oregon             4.9      159        67  29.3           3
Pennsylvania       6.3      106        72  14.9           3
Rhode Island       3.4      174        87   8.3           3
South Carolina    14.4      279        48  22.5           0
South Dakota       3.8       86        45  12.8           4
Tennessee         13.2      188        59  26.9           0
Texas             12.7      201        80  25.5           2
Utah               3.2      120        80  22.9           3
Vermont            2.2       48        32  11.2           4
Virginia           8.5      156        63  20.7           3
Washington         4.0      145        73  26.2           3
West Virginia      5.7       81        39   9.3           4
Wisconsin          2.6       53        66  10.8           3
Wyoming            6.8      161        60  15.6           3
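In [ ]:
# Compare the two clusterings with a contingency table. Cluster labels are
# arbitrary, so only the pattern of overlap between the K=5 K-means labels
# and the five hierarchical clusters matters. A minimal sketch:
print(pd.crosstab(clusters_5, USArrests['HC_Cluster'],
                  rownames=['KMeans_K5'], colnames=['HC']))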
In [ ]:
# 3. College data - K-means method
! pip install ISLP
from ISLP import load_data
# Load the College dataset from the ISLP package
College = load_data('College')
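In [ ]:
# A quick look before clustering. College should have 777 rows and 18 columns,
# of which only Private (Yes/No) is non-numeric -- hence the select_dtypes
# call in the next cell. A minimal inspection sketch:
print(College.shape)
print(College.dtypes)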
In [17]:
# Create a matrix of numeric variables
X_college = College.select_dtypes(include=[np.number])
print(X_college.shape) # Check dimensions
# K-means with K=5 for college data
kmeans_college = KMeans(n_clusters=5, random_state=42)
clusters_college = kmeans_college.fit_predict(X_college)
# Cluster means
print('Cluster means for College:\n', kmeans_college.cluster_centers_)
# Clustering vector
College['Cluster'] = clusters_college
print(College[['Cluster']].head())
# Plot pairs of variables with assigned clusters
plt.figure(figsize=(12, 12))
plt.subplot(2, 2, 1)
plt.scatter(College['Outstate'], College['Top10perc'], c=clusters_college)
plt.title('Outstate vs Top10perc')
plt.subplot(2, 2, 2)
plt.scatter(College['S.F.Ratio'], College['PhD'], c=clusters_college)
plt.title('S.F.Ratio vs PhD')
plt.subplot(2, 2, 3)
plt.scatter(College['Apps'], College['Enroll'], c=clusters_college)
plt.title('Apps vs Enroll')
plt.subplot(2, 2, 4)
plt.scatter(College['Room.Board'], College['Private'], c=clusters_college)  # Private is categorical (Yes/No)
plt.title('Room.Board vs Private')
plt.tight_layout()
plt.show()
(777, 17)
Cluster means for College:
 [[1.18157500e+03 8.98534091e+02 3.55340909e+02 2.12704545e+01 4.85340909e+01
  1.49989773e+03 4.97884091e+02 9.18096364e+03 4.10183182e+03 5.29795455e+02
  1.31207273e+03 6.50954545e+01 7.27500000e+01 1.44000000e+01 2.11704545e+01
  7.63585227e+03 6.27045455e+01]
 [1.50671143e+04 1.00431714e+04 3.84400000e+03 4.15428571e+01 7.40285714e+01
  1.96352571e+04 3.57811429e+03 9.46885714e+03 4.57285714e+03 6.03000000e+02
  1.75028571e+03 8.67142857e+01 9.19428571e+01 1.57485714e+01 1.76000000e+01
  1.11875143e+04 6.57428571e+01]
 [2.62470732e+03 1.72104268e+03 5.27079268e+02 4.00487805e+01 6.91463415e+01
  2.09468902e+03 2.83243902e+02 1.57379268e+04 5.25655488e+03 5.78481707e+02
  1.04590854e+03 8.34207317e+01 9.02682927e+01 1.14146341e+01 3.27012195e+01
  1.37479878e+04 7.68414634e+01]
 [5.78270000e+03 3.98210000e+03 1.71863333e+03 2.22333333e+01 5.34583333e+01
  9.13747500e+03 2.23692500e+03 6.96022500e+03 3.76000000e+03 5.61841667e+02
  1.73805833e+03 7.84916667e+01 8.46583333e+01 1.72633333e+01 1.43833333e+01
  7.45545000e+03 5.63416667e+01]
 [8.92527778e+03 3.42455556e+03 1.24844444e+03 7.57777778e+01 9.18333333e+01
  4.86738889e+03 2.98944444e+02 1.80621667e+04 5.98150000e+03 5.75666667e+02
  1.27838889e+03 9.33333333e+01 9.65555556e+01 6.49444444e+00 3.62222222e+01
  3.36272778e+04 8.95000000e+01]]
   Cluster
0        0
1        2
2        0
3        2
4        0
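In [ ]:
# The College variables sit on very different scales (Apps in the thousands,
# percentages under 100), so the unscaled K-means above is dominated by the
# largest-variance columns. A minimal sketch of re-running on standardized
# data and tabulating how the assignments change:
X_college_scaled = StandardScaler().fit_transform(X_college)
kmeans_scaled = KMeans(n_clusters=5, n_init=20, random_state=42)
clusters_scaled = kmeans_scaled.fit_predict(X_college_scaled)
print(pd.crosstab(clusters_college, clusters_scaled,
                  rownames=['Unscaled'], colnames=['Scaled']))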
In [19]:
# 4. College data - Hierarchical Clustering
HC_college = linkage(X_college, method='complete')
plt.figure(figsize=(10, 8))
dendrogram(HC_college, labels=College.index)
plt.title('Hierarchical Clustering Dendrogram (College)')
plt.xlabel('Colleges')
plt.ylabel('Distance')
plt.show()
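# With 777 leaves the full dendrogram above is unreadable; scipy can truncate
# it to show only the last p merges (a sketch using scipy's truncate_mode):
plt.figure(figsize=(10, 6))
dendrogram(HC_college, truncate_mode='lastp', p=30)
plt.title('Truncated Dendrogram (last 30 merges)')
plt.show()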
# Random sample for hierarchical clustering
Z = np.random.choice(College.index, 20, replace=False)
Y = X_college.loc[Z]
HCZ = linkage(Y, method='complete')
plt.figure(figsize=(10, 8))
dendrogram(HCZ, labels=Y.index)
plt.title('Hierarchical Clustering Dendrogram (Sample Colleges)')
plt.xlabel('Colleges')
plt.ylabel('Distance')
plt.show()
# Create clusters from the hierarchical clustering
HC4 = cut_tree(HC_college, n_clusters=4)
College['HC_Cluster'] = HC4.flatten()
print(College[['HC_Cluster']].head())
   HC_Cluster
0           0
1           0
2           0
3           0
4           0
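In [ ]:
# A numeric way to pick the number of clusters: average silhouette score
# (closer to 1 means better-separated clusters). Sketch assumes the scaled
# matrix X_college_scaled from the K-means section above.
from sklearn.metrics import silhouette_score
for k in range(2, 7):
    labels = KMeans(n_clusters=k, n_init=20, random_state=42).fit_predict(X_college_scaled)
    print(f'K={k}: silhouette = {silhouette_score(X_college_scaled, labels):.3f}')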
In [ ]: